#!/usr/bin/bash
#
# plugin for munin to monitor usage of NSD.
#
# (C) 2008 W.C.A. Wijngaards.  BSD Licensed.
#
# To install: compile NSD with --enable-bind8-stats (enabled by default)
#	and enable nsd-control in nsd.conf with the line
#	remote-control:	control-enable: yes
# Run the command nsd-control-setup as root to generate the key files.
#
# Environment variables for this script
#	statefile	- where to put temporary statefile.
#	nsd_conf	- where the nsd.conf file is located.
#	nsd_control	- where to find nsd-control executable.
#	nsd_checkconf	- where to find nsd-checkconf executable.
#
# You can set them in your munin/plugin-conf.d/plugins.conf file
# with:
# [nsd_*]
# user root
# env.statefile /opt/local/var/munin/plugin-state/nsd-state
# env.nsd_conf /opt/local/etc/nsd.conf
# env.nsd_control /opt/local/sbin/nsd-control
# env.nsd_checkconf /opt/local/sbin/nsd-checkconf
#
# This plugin can create different graphs depending on what name
# you link it as (with ln -s) into the plugins directory
# You can link it multiple times.
# If you are only a casual user, the _hits and _by_type are most interesting,
# possibly followed by _by_rcode.
#
#	nsd_hits		- base volume, transport type, failures 
#	nsd_memory	- memory usage
#	nsd_by_type	- incoming queries by type
#	nsd_by_class	- incoming queries by class
#	nsd_by_opcode	- incoming queries by opcode
#	nsd_by_rcode	- answers by rcode
#	nsd_zones		- number of zones
#
# Magic markers - optional - used by installation scripts and
# munin-config:
#
#%# family=contrib
#%# capabilities=autoconf suggest

# POD documentation
: <<=cut
=head1 NAME

nsd_ - Munin plugin to monitor the NSD server.

=head1 APPLICABLE SYSTEMS

System with NSD daemon.

=head1 CONFIGURATION

  [nsd_*]
  user root
  env.statefile /opt/local/var/munin/plugin-state/nsd-state
  env.nsd_conf /opt/local/etc/nsd.conf
  env.nsd_control /opt/local/sbin/nsd-control
  env.nsd_checkconf /opt/local/sbin/nsd-checkconf

Use the .env settings to override the defaults.

=head1 USAGE

Can be used to present different graphs. Use ln -s to link the plugin
under one of the names below in the plugins directory to enable that graph.

  nsd_hits       - base volume, transport type, failures
  nsd_memory     - memory usage
  nsd_by_type    - incoming queries by type
  nsd_by_class   - incoming queries by class
  nsd_by_opcode  - incoming queries by opcode
  nsd_by_rcode   - answers by rcode
  nsd_zones      - number of zones

=head1 AUTHOR

Copyright 2008 W.C.A. Wijngaards

=head1 LICENSE

BSD

=cut

# Runtime settings. Each one takes its value from the matching environment
# variable (statefile, nsd_conf, nsd_control, nsd_checkconf) when that is set
# and non-empty, and falls back to the built-in default otherwise.
state="${statefile:-/opt/local/var/munin/plugin-state/nsd-state}"
conf="${nsd_conf:-/opt/local/etc/nsd/nsd.conf}"
ctrl="${nsd_control:-/opt/local/sbin/nsd-control}"
chkconf="${nsd_checkconf:-/opt/local/sbin/nsd-checkconf}"
# the lock file lives right next to the statefile.
lock="${state}.lock"

# Minimum age of the statefile, in seconds, before it is fetched again.
# Keeping it around this long lets the multiple links of this script
# share one set of results per polling round.
lee="55"

# sed arguments that shorten the statistic names,
# to keep things within 19 characters
ABBREV='-e s/num/n/ -e s/type/t/ -e s/opcode/o/ -e s/rcode/r/ -e s/class/c/'

# get value from $1 into return variable $value
#
# Looks for the line "<name>=<text>" in the statefile ($state) and stores the
# text after the last '=' of that line in the global variable $value.
# The name is compared as a literal string, so the dots in names such as
# "num.udp" only match a dot, and $state may contain whitespace.
# When the name is not found, has an empty value, or the statefile cannot
# be read, $value is set to "0".
#	$1	- name of the statistic, e.g. "time.elapsed"
get_value ( ) {
	# appending "" forces a string (not numeric) comparison in awk
	value="`awk -F= -v key="$1" '($1 "") == (key "") { print $NF }' "$state"`"
	if test "$value"x = ""x; then
		value="0"
	fi
}

# download the state from NSD.
#
# Runs "$ctrl -c $conf stats", stores the output in the statefile ($state)
# and appends a "timestamp=<seconds since epoch>" line to it. The fetch is
# skipped when the statefile carries a timestamp less than $lee seconds old.
# The lock file ($lock) holds the pid of the process doing the work and is
# removed again on every way out of this function.
# The new statefile is written next to the old one and renamed into place,
# so readers that do not take the lock never see a half-written file, and a
# failed fetch leaves the previous statefile untouched.
# Uses get_value (and therefore overwrites $value).
# Exits the script with status 1 when the lock cannot be obtained or the
# statistics cannot be retrieved.
get_state ( ) {
	# obtain lock for fetching the state
	# because there is a race condition in fetching and writing to file

	# see if the lock is stale (its owner is gone), if so, take it
	if test -f "$lock" ; then
		pid="$(cat "$lock" 2>&1)"
		if ! kill -0 "$pid" >/dev/null 2>&1 && test "$pid" != "$$" ; then
			echo $$ >"$lock"
		fi
	fi

	i=0
	while test ! -f "$lock" || test "$(cat "$lock" 2>&1)" != "$$"; do
		while test -f "$lock"; do
			# wait: the first 1000 polls are back to back, after that
			# there is a one second pause per poll, and after 1500
			# polls the lock is considered broken.
			i=$((i + 1))
			if test "$i" -gt 1000; then
				sleep 1
			fi
			if test "$i" -gt 1500; then
				echo "error locking $lock" "=" $(cat "$lock")
				rm -f "$lock"
				exit 1
			fi
		done
		# try to get it
		echo $$ >"$lock"
	done
	# do not refetch if the file exists and only LEE seconds old
	if test -f "$state"; then
		now=$(date +%s)
		get_value "timestamp"
		if test "$now" -lt "$(expr "$value" + "$lee")"; then
			rm -f "$lock"
			return
		fi
	fi
	# $ctrl stays unquoted on purpose, so that a setting that carries extra
	# words is split into command and arguments exactly as before.
	newstate="$state.$$"
	if $ctrl -c "$conf" stats > "$newstate" \
		&& echo "timestamp=$(date +%s)" >> "$newstate" \
		&& mv -f "$newstate" "$state"; then
		rm -f "$lock"
		return
	fi
	echo "error retrieving data from the server"
	rm -f "$newstate" "$lock"
	exit 1
}

# "autoconf" mode: report whether this plugin can be used on this machine.
# Prints "yes" when the NSD configuration file ($conf) exists and the
# directory for the statefile ($state) exists or could be created, and
# "no (reason)" otherwise. The answer is carried by the printed word only:
# munin expects autoconf to exit 0 for "no" as well, a non-zero status is
# treated as a plugin error.
if test "$1" = "autoconf" ; then
	if test ! -f "$conf"; then
		echo "no ($conf does not exist)"
		exit 0
	fi
	statedir=`dirname "$state"`
	if test ! -d "$statedir"; then
		mkdir -p "$statedir"
		if test ! -d "$statedir"; then
			echo "no ($state directory does not exist)"
			exit 0
		fi
	fi
	echo yes
	exit 0
fi

# "suggest" mode: print the graph names this plugin can be linked as
# (the part that follows "nsd_" in the link name), one per line.
case "$1" in
suggest)
	for graph in hits memory by_type by_class by_opcode by_rcode zones; do
		echo "$graph"
	done
	exit 0
	;;
esac

# determine my type, by name: everything after the last "nsd_" in the
# name this script was started as.
id=${0##*nsd_}
# some default to keep people sane.
test -n "$id" || id="hits"

# if $1 exists in statefile, config is echoed with label $2
#
# Prints "<field>.label $2" and "<field>.min 0" when the statefile ($state)
# has a line for the statistic $1, and nothing otherwise (also when the
# statefile cannot be read). <field> is $1 shortened with $ABBREV and with
# its dots turned into underscores.
# The name is compared as a literal string, so the dots in it only match a
# dot, and $state may contain whitespace.
#	$1	- name of the statistic, e.g. "server0.queries"
#	$2	- label to show for it
exist_config ( ) {
	mn=`echo "$1" | sed $ABBREV | tr . _`
	# appending "" forces a string (not numeric) comparison in awk
	if awk -F= -v key="$1" '($1 "") == (key "") { found = 1; exit }
		END { exit !found }' "$state" >/dev/null 2>&1; then
		echo "$mn.label $2"
		echo "$mn.min 0"
	fi
}

# print label and min 0 for a name $1 in nsd format
# The munin field name is $1 shortened with $ABBREV and with its dots
# turned into underscores; $2 is the label to show for it.
p_config ( ) {
	field=`echo $1 | sed $ABBREV | tr . _`
	printf '%s\n' "$field.label $2" "$field.min 0"
}

# "config" mode: print the munin graph definition for the graph named by
# $id (header lines, a label/min pair per data series, then graph_info)
# and exit 0. An $id that names no graph prints nothing.
if test "$1" = "config" ; then
	# the per-type/class/opcode/rcode series are read from the statefile,
	# so fetch one first when there is none yet.
	test -f $state || get_state

	# print the four header lines every graph starts with:
	# $1 = title, $2 = base for graph_args, $3 = vertical label.
	graph_header ( ) {
		echo "graph_title $1"
		echo "graph_args --base $2 -l 0"
		echo "graph_vlabel $3"
		echo "graph_category DNS"
	}

	# p_config every statefile entry whose name starts with "num.$1";
	# the label is the name with its leading "num.$1." part removed.
	config_by_kind ( ) {
		for entry in `grep "^num.$1" $state`; do
			key=`echo $entry | sed -e 's/=.*$//'`
			short=`echo $key | sed -e s/num.$1.//`
			p_config "$key" "$short"
		done
	}

	case $id in
	hits)
		graph_header "NSD traffic" 1000 "queries / second"
		# server0 up to server15; only the ones that are present in
		# the statefile get a label.
		n=0
		while test $n -le 15; do
			exist_config server$n.queries "queries handled by server$n"
			n=`expr $n + 1`
		done
		p_config "num.queries" "total queries"
		p_config "num.udp" "UDP ip4 queries"
		p_config "num.udp6" "UDP ip6 queries"
		p_config "num.tcp" "TCP ip4 queries"
		p_config "num.tcp6" "TCP ip6 queries"
		p_config "num.edns" "queries with EDNS OPT"
		p_config "num.ednserr" "queries failed EDNS parse"
		p_config "num.answer_wo_aa" "nonauthor. queries (referrals)"
		p_config "num.rxerr" "receive failed"
		p_config "num.txerr" "transmit failed"
		p_config "num.truncated" "truncated replies with TC"
		p_config "num.raxfr" "AXFR from allowed client"
		p_config "num.dropped" "dropped due to sanity check"
		echo "graph_info DNS queries."
		;;
	memory)
		graph_header "NSD memory usage" 1024 "memory used in bytes"
		p_config "size.vsz" "Total virtual memory (VSZ)"
		p_config "size.rss" "Total resident memory (RSS)"
		p_config "size.db.mem" "data in memory"
		p_config "size.xfrd.mem" "xfr and notify memory"
		p_config "size.config.mem" "config memory"
		p_config "size.db.disk" "mmap of nsd.db file"
		p_config "size.config.disk" "config zonelist on disk"
		echo "graph_info The memory used by NSD, xfrd and config. Disk size of nsd.db and zonelist."
		;;
	by_type)
		graph_header "NSD queries by type" 1000 "queries / second"
		config_by_kind type
		echo "graph_info queries by DNS RR type queried for"
		;;
	by_class)
		graph_header "NSD queries by class" 1000 "queries / second"
		config_by_kind class
		echo "graph_info queries by DNS RR class queried for."
		;;
	by_opcode)
		graph_header "NSD queries by opcode" 1000 "queries / second"
		config_by_kind opcode
		echo "graph_info queries by opcode in the query packet."
		;;
	by_rcode)
		graph_header "NSD answers by return code" 1000 "answer packets / second"
		config_by_kind rcode
		echo "graph_info answers split out by return value."
		;;
	zones)
		graph_header "NSD number of zones" 1000 "zone count"
		p_config "zone.total" "total zones"
		p_config "zone.master" "master zones"
		p_config "zone.slave" "slave zones"
		echo "graph_info number of zones served by NSD."
		;;
	esac

	exit 0
fi

# do the stats itself: refresh the statefile if needed
get_state

# get the time elapsed; the rates printed further down are divided by it,
# so a zero (also what get_value gives for a missing entry) cannot be used.
get_value "time.elapsed"
case "$value" in
0|0.000000)
	echo "error: time elapsed 0 or could not retrieve data"
	exit 1
	;;
esac
elapsed="$value"

# print value for $1 / elapsed
# Looks up statistic $1 with get_value and prints "<field>.value <rate>",
# where <rate> is that value divided by $elapsed, worked out by bc with
# scale=6. <field> is $1 shortened with $ABBREV, dots turned into underscores.
print_qps ( ) {
	get_value $1
	field=`echo $1 | sed $ABBREV | tr . _`
	rate=`echo "scale=6; $value / $elapsed" | bc`
	echo "$field.value" $rate
}

# print qps if line already found in $2
# $1 is the name of the statistic and $2 its complete "name=value" line from
# the statefile; prints "<field>.value <rate>" with <rate> the value
# divided by $elapsed, worked out by bc with scale=6.
print_qps_line ( ) {
	# keep what follows the last '=' of the line
	value="${2##*=}"
	field=`echo $1 | sed $ABBREV | tr . _`
	rate=`echo "scale=6; $value / $elapsed" | bc`
	echo "$field.value" $rate
}

# print value for $1
# Looks up statistic $1 with get_value and prints "<field>.value <value>"
# as is; <field> is $1 shortened with $ABBREV, dots turned into underscores.
print_value ( ) {
	get_value $1
	field=`echo $1 | sed $ABBREV | tr . _`
	echo "$field.value" $value
}

# Print the "<field>.value <number>" lines for the graph selected by $id.
# An $id that names no graph prints nothing.
case $id in
hits)
	# the per-server counters come first, followed by the totals; a
	# statistic is only reported when the statefile has a line for it.
	counters=""
	n=0
	while test $n -le 15; do
		counters="$counters server$n.queries"
		n=`expr $n + 1`
	done
	for x in $counters \
		num.queries num.udp num.udp6 num.tcp num.tcp6 \
		num.edns num.ednserr num.answer_wo_aa num.rxerr num.txerr \
		num.truncated num.raxfr num.dropped ; do
		grep "^$x=" $state >/dev/null 2>&1 || continue
		print_qps $x
	done
	;;
memory)
	# print $1 times 1024 when $1 is a number, and 0 when it is not.
	kb_to_bytes ( ) {
		if test "`expr $1 + 1 - 1 2>&1`" -eq "$1" >/dev/null 2>&1; then
			expr $1 \* 1024
		else
			echo 0
		fi
	}
	# get the total memory for NSD
	serverpid=`$ctrl -c $conf serverpid 2>&1`
	# small race condition, if reload happens between previous and next
	# lines, if so, detect by checking if we have a number as output.
	rssval=`ps -p $serverpid -o rss= 2>&1`
	vszval=`ps -p $serverpid -o vsz= 2>&1`
	rssval=`kb_to_bytes $rssval`
	vszval=`kb_to_bytes $vszval`
	echo "size_vsz.value" $vszval
	echo "size_rss.value" $rssval
	for x in size.db.mem size.xfrd.mem size.config.mem \
		size.db.disk size.config.disk; do
		print_value $x
	done
	;;
by_type|by_class|by_opcode|by_rcode)
	# the statefile entries for this graph start with "num.<kind>",
	# where <kind> is the graph name without its "by_" prefix.
	kind=${id#by_}
	for x in `grep "^num.$kind" $state`; do
		print_qps_line `echo $x | sed -e 's/=.*$//'` $x
	done
	;;
zones)
	# the total is the sum of the master and the slave zones.
	get_value "zone.master"
	nummas="$value"
	get_value "zone.slave"
	numsla="$value"
	echo "zone_total.value" `expr $nummas + $numsla`
	echo "zone_master.value" "$nummas"
	echo "zone_slave.value" "$numsla"
	;;
esac
